TL;DR

The zinb model doesn’t seem to work well in the Fluidigm data. This is independent of normalization, meaning that both unnormalized data and normalized data (both with scran and full-quantile) lead to bad results.

PCA leads to better results, even though, as many people have shown, the first component is highly correlated to detection rate, even after normalization.

ZIFA seems to work well in the Fluidigm datasets, suggesting that we should be able to find a way to make zinb work as well.

One key difference between ZIFA and zinb is that in our model both \(\mu\) and \(\pi\) depend on \(W\), while in the ZIFA model only \(\mu\) does. We should consider a model that doesn’t have a \(W\) in \(\pi\).

It seems (preliminary analyses) that removing the intercept in V makes the weird W behavior go away. Still, I think that we should add the ability to remove W from \(\pi\) and see what is the difference.

High coverage data

Select high coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.

data("fluidigm")
fluidigm_high <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="High")]
filter <- apply(assay(fluidigm_high)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 6998

No normalization

Fit a zinb model on the high coverage data (no normalization).

fluidigm_high <- fluidigm_high[filter,]
high <- assay(fluidigm_high)
set.seed(23424)
print(system.time(zinb_high <- zinbFit(high, ncores = 3, K = 2)))
##    user  system elapsed 
## 670.571  72.622 296.439

Plot the results with cells colored according to their biological condition.

bio <- as.factor(colData(fluidigm_high)$Biological_Condition)
cl <- as.factor(colData(fluidigm_high)$Cluster2)

plot(zinb_high@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_high@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

Compared to PCA (of the log counts)

pca <- prcomp(t(log1p(high)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)

plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_high@W[,1], colSums(high>0),method="spearman")
## [1] -0.2168706
#second factor and the total number of detected genes in the cell
cor(zinb_high@W[,2], colSums(high>0),method="spearman")
## [1] 0.5244318
#first factor and the total number of counts in the cell
cor(zinb_high@W[,1], colSums(high),method="spearman")
## [1] -0.1518794
#second factor and the total number of counts in the cell
cor(zinb_high@W[,2], colSums(high),method="spearman")
## [1] 0.3468531

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(high>0), method="spearman")
## [1] 0.9232517
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(high>0), method="spearman")
## [1] 0.297028
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(high), method="spearman")
## [1] 0.1270105
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(high), method="spearman")
## [1] -0.3141171

Correlation between PCA and ZINB

cor(pca$x[,1], zinb_high@W[,1], method="spearman")
## [1] -0.2783654
cor(pca$x[,2], zinb_high@W[,1], method="spearman")
## [1] -0.01188811
cor(pca$x[,1], zinb_high@W[,2], method="spearman")
## [1] 0.7553322
cor(pca$x[,2], zinb_high@W[,2], method="spearman")
## [1] -0.4728147

Using scran size factors

sce <- newSCESet(countData=data.frame(high))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))

sf <- sizeFactors(sce)

norm <- exprs(convertTo(sce, type="monocle"))
fq <- betweenLaneNormalization(high, which="full")

plotRLE(high, outline=FALSE, col=cols[bio], main="Unnormalized counts")

plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")

plotRLE(fq, outline=FALSE, col=cols[bio], main="FQ normalization")

I’m not convinced that this is the best normalization, but since it’s the only one specifically proposed for scRNA-seq, it’s a good place to start.

set.seed(3525)
offsets <- matrix(rep(sf, NROW(high)), ncol=NROW(high), nrow=NCOL(high))
print(system.time(zinb_norm <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
##    user  system elapsed 
## 748.390  80.517 318.133

Plot the results with cells colored according to their biological condition.

plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

qc <- as.matrix(colData(fluidigm_high)[,metadata(fluidigm_high)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)

quality <- qcpca$x[,1]
detection_rate <- colSums(high>0)

data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Compared to PCA (of the log counts)

pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)

plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(high>0),method="spearman")
## [1] -0.2240385
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(high>0),method="spearman")
## [1] 0.4842657
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(high),method="spearman")
## [1] -0.1604021
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(high),method="spearman")
## [1] 0.3326923

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(high>0), method="spearman")
## [1] 0.9566434
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(high>0), method="spearman")
## [1] -0.1056818
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(high), method="spearman")
## [1] -0.07137238
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(high), method="spearman")
## [1] 0.4917832

Correlation between PCA and ZINB

cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] -0.2262675
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.004195804
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.6608392
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.6752622

Using FQ derived offsets

se <- newSeqExpressionSet(high)
se <- betweenLaneNormalization(se, which="full", offset=TRUE)
offsets <- t(offst(se))
set.seed(35235)
print(system.time(zinb_fq <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
##    user  system elapsed 
## 841.972  87.240 353.576

Plot the results with cells colored according to their biological condition.

plot(zinb_fq@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_fq@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

data.frame(W1=zinb_fq@W[,1], W2=zinb_fq@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Compared to PCA (of the log counts)

pca_fq <- prcomp(t(log1p(fq)))
plot(pca_fq$x, col=cols[bio], pch=19)
legend("topright", levels(bio), fill=cols)

plot(pca_fq$x, col=cols2[cl], pch=19)
legend("topright", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_fq@W[,1], colSums(high>0),method="spearman")
## [1] 0.08378497
#second factor and the total number of detected genes in the cell
cor(zinb_fq@W[,2], colSums(high>0),method="spearman")
## [1] 0.3754808
#first factor and the total number of counts in the cell
cor(zinb_fq@W[,1], colSums(high),method="spearman")
## [1] -0.1039336
#second factor and the total number of counts in the cell
cor(zinb_fq@W[,2], colSums(high),method="spearman")
## [1] 0.3515297

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca_fq$x[,1], colSums(high>0), method="spearman")
## [1] 0.7178322
#second factor and the total number of detected genes in the cell
cor(pca_fq$x[,2], colSums(high>0), method="spearman")
## [1] 0.6338724
#first factor and the total number of counts in the cell
cor(pca_fq$x[,1], colSums(high), method="spearman")
## [1] 0.2144231
#second factor and the total number of counts in the cell
cor(pca_fq$x[,2], colSums(high), method="spearman")
## [1] -0.2588287

Correlation between PCA and ZINB

cor(pca_fq$x[,1], zinb_fq@W[,1], method="spearman")
## [1] -0.002097902
cor(pca_fq$x[,2], zinb_fq@W[,1], method="spearman")
## [1] 0.1466783
cor(pca_fq$x[,1], zinb_fq@W[,2], method="spearman")
## [1] 0.7437937
cor(pca_fq$x[,2], zinb_fq@W[,2], method="spearman")
## [1] -0.1988199
## write matrices to file to feed to ZIFA in python
write.csv(log1p(high), file="logcounts_high.csv")
write.csv(log1p(norm), file="logcounts_high_scran.csv")

Low coverage data

Select low coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.

fluidigm_low <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="Low")]
filter <- apply(assay(fluidigm_low)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 6998

No normalization

Fit a zinb model on the low coverage data (no normalization).

fluidigm_low <- fluidigm_low[filter,]
low <- assay(fluidigm_low)
set.seed(654)
print(system.time(zinb_low <- zinbFit(low, ncores = 3, K = 2)))
##    user  system elapsed 
## 379.503  60.009 163.815

Plot the results with cells colored according to their biological condition.

bio <- as.factor(colData(fluidigm_low)$Biological_Condition)
cl <- as.factor(colData(fluidigm_low)$Cluster2)

plot(zinb_low@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_low@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

Compared to PCA (of the log counts)

pca <- prcomp(t(log1p(low)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)

plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_low@W[,1], colSums(low>0),method="spearman")
## [1] 0.4277894
#second factor and the total number of detected genes in the cell
cor(zinb_low@W[,2], colSums(low>0),method="spearman")
## [1] -0.5360534
#first factor and the total number of counts in the cell
cor(zinb_low@W[,1], colSums(low),method="spearman")
## [1] 0.0005681818
#second factor and the total number of counts in the cell
cor(zinb_low@W[,2], colSums(low),method="spearman")
## [1] 0.2904283

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(low>0), method="spearman")
## [1] -0.6268781
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(low>0), method="spearman")
## [1] -0.03942437
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(low), method="spearman")
## [1] 0.06979895
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(low), method="spearman")
## [1] -0.474257

Correlation between PCA and ZINB

cor(pca$x[,1], zinb_low@W[,1], method="spearman")
## [1] -0.8020979
cor(pca$x[,2], zinb_low@W[,1], method="spearman")
## [1] -0.06770105
cor(pca$x[,1], zinb_low@W[,2], method="spearman")
## [1] 0.8125874
cor(pca$x[,2], zinb_low@W[,2], method="spearman")
## [1] -0.4075612

Using scran size factors

sce <- newSCESet(countData=data.frame(low))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))

sf <- sizeFactors(sce)

norm <- exprs(convertTo(sce, type="monocle"))

plotRLE(low, outline=FALSE, col=cols[bio], main="Unnormalized counts")

plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")

set.seed(3525)
offsets <- matrix(rep(sf, NROW(low)), ncol=NROW(low), nrow=NCOL(low))
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, O_mu = offsets)))
##    user  system elapsed 
## 266.052  45.152 117.459

Plot the results with cells colored according to their biological condition.

plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

qc <- as.matrix(colData(fluidigm_low)[,metadata(fluidigm_low)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)

quality <- qcpca$x[,1]
detection_rate <- colSums(low>0)

data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Compared to PCA (of the log counts)

pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)

plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(low>0),method="spearman")
## [1] 0.02917491
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(low>0),method="spearman")
## [1] -0.5416043
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(low),method="spearman")
## [1] -0.03199301
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(low),method="spearman")
## [1] 0.1593969

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(low>0), method="spearman")
## [1] -0.8312772
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(low>0), method="spearman")
## [1] 0.5694024
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(low), method="spearman")
## [1] 0.4491259
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(low), method="spearman")
## [1] -0.5332168

Correlation between PCA and ZINB

cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] 0.2093969
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.4050699
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.859965
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.1836538

Include detection rate as covariates

x <- model.matrix(~scale(detection_rate))
set.seed(9948)
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, X = x, which_X_mu=1:2, which_X_pi=1L)))
##    user  system elapsed 
## 381.644  47.689 170.539
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

## write matrices to file to feed to ZIFA in python
write.csv(log1p(low), file="logcounts_low.csv")
write.csv(log1p(norm), file="logcounts_low_scran.csv")
write.csv(log1p(low[1:5, 1:3]), file="logcounts_toy.csv")

ZIFA (low coverage, unnormalized data)

zifa_res <- read.csv("zifa_low.csv", header=FALSE)

plot(zifa_res, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

Adding \(V_\pi\)

library(biomaRt)
mart <- useMart("ensembl")
mart <- useDataset("hsapiens_gene_ensembl", mart = mart)
attrs <- c("hgnc_symbol", "entrezgene")
bm <- getBM(attributes=attrs, mart = mart)
bm <- bm[match(rownames(low), bm[,1]),]
bm <- na.omit(bm)
low <- low[bm[,1],]

gene_info <- getGeneLengthAndGCContent(as.character(bm[,2]), "hsa", mode="org.db")
rownames(gene_info) <- bm[,1]

gene_info <- na.omit(gene_info)
low <- low[rownames(gene_info),]

NB: currently, there is no way to have no columns of X or V in either \(\mu\) or \(\pi\). We should let the user specify NULL?

V <- cbind(rep(0, NROW(low)), gene_info)
print(system.time(zinb_gc <- zinbFit(low, ncores = 3, K = 2, V = V, which_V_mu=1L, which_V_pi=2:3)))
##    user  system elapsed 
## 194.554  23.157  88.511
plot(zinb_gc@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_gc@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(cl), fill=cols2)

Removing V intercept

print(system.time(zinb_nov <- zinbFit(low, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(low)))))
##    user  system elapsed 
## 300.793  56.159 143.995
plot(zinb_nov@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_nov@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(cl), fill=cols2)

Keeping it for \(\pi\)

V <- cbind(rep(0, NROW(low)), rep(1, NROW(low)))
X <- cbind(rep(0, NCOL(low)), rep(1, NCOL(low)))
print(system.time(zinb_vx <- zinbFit(low, ncores = 3, K = 2, V=V, X=X, which_V_mu=1L, which_V_pi=2L, which_X_mu=2L, which_X_pi=1L)))
##    user  system elapsed 
## 280.548  36.245 126.151
plot(zinb_vx@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomleft", levels(bio), fill=cols)